# Import modules
import pandas as pd
import numpy as np
# Algorithms used: logistic regression, SGD classification, linear regression, k-nearest-neighbor classification, naive Bayes
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
# Build the list of column headers for the dataset
#column_names = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
column_names = ['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean','radius_se','texture_se','perimeter_se','area_se','smoothness_se','compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se','radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst','compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst']
# Read the dataset from the UCI repository with pandas.read_csv
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',names=column_names)
df.info()
# Attribute information:
# 1) ID number
# 2) Diagnosis (M = malignant, B = benign)
# 3-32) Ten real-valued features computed for each cell nucleus,
#       each given as a mean, a standard error (se), and a worst value
df.head(10)
sn.countplot(x='diagnosis', data=df)
## id is not used for classification, so drop it; diagnosis is the label and is binarized (M=1, B=0)
# the feature names need no domain knowledge, because I believe machine learning is awesome :)
# df['diagnosis'] = pd.get_dummies(df['diagnosis'],drop_first=True)
dia = df['diagnosis'].map({'M': 1, 'B': 0})
df.drop('id', axis=1, inplace=True)
df.drop('diagnosis', axis=1, inplace=True)
data = df
data.describe()
# Build a scatter matrix
# The scatter matrix shows pairwise relationships between the features
pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (80,60), diagonal = 'kde');
# Correlation heatmap of the features
f,ax = plt.subplots(figsize=(20, 20))
sn.heatmap(data.corr(), annot=True, linewidths=.5, fmt= '.2f',ax=ax,cmap='coolwarm')
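# A possible follow-up (not in the original): with 30 features the annotated
# heatmap is dense, so listing the most correlated pairs programmatically can
# be easier to read. The 0.9 cutoff is an assumption, not the author's.
corr = data.corr().abs()
# keep only the upper triangle so each pair appears once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack().sort_values(ascending=False)
print(pairs[pairs > 0.9])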
# Split the dataset with train_test_split from sklearn.model_selection
# (the old sklearn.cross_validation module was removed from scikit-learn)
from sklearn.model_selection import train_test_split
# Randomly sample 25% of the data for testing; the remaining 75% forms the training set
X_train,X_test,y_train,y_test = train_test_split(data,dia,test_size = 0.25,random_state = 33)
# Inspect the size and class distribution of the training set
y_train.value_counts()
X_train.shape
y_train.shape
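# A possible variant (an assumption, not the author's choice): the classes are
# imbalanced, so a stratified split keeps the malignant/benign ratio the same
# in the training and test sets. The *_s names are hypothetical.
Xtr_s, Xte_s, ytr_s, yte_s = train_test_split(
    data, dia, test_size=0.25, random_state=33, stratify=dia)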
LR = LogisticRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_test)
print(metrics.classification_report(y_test,LR_pred, digits = 5))
#LR_cm=metrics.confusion_matrix(y_test,LR_pred)
#LR.score(X_test,y_test)
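# A minimal sketch (not in the original): SGDClassifier, KNeighborsClassifier
# and GaussianNB are imported above but never fit; fitting them on the same
# split gives a rough comparison. LinearRegression is left out because it
# predicts continuous values rather than class labels. All hyperparameters
# are sklearn defaults, and SGD in particular is sensitive to the unscaled
# features used here.
for name, clf in [('SGDClassifier', SGDClassifier()),
                  ('KNeighborsClassifier', KNeighborsClassifier()),
                  ('GaussianNB', GaussianNB())]:
    clf.fit(X_train, y_train)
    print(name, metrics.accuracy_score(y_test, clf.predict(X_test)))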
# Mean normalization: center on the mean, scale by the feature range
x_train_N = (X_train-X_train.mean())/(X_train.max()-X_train.min())
x_test_N = (X_test-X_test.mean())/(X_test.max()-X_test.min())
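# An aside (not in the original): the two lines above compute mean/max/min on
# the test set itself, which leaks test statistics into the scaling. A
# leakage-free variant reuses the training-set statistics for both sets:
mu, rng = X_train.mean(), X_train.max() - X_train.min()
x_train_N = (X_train - mu) / rng
x_test_N = (X_test - mu) / rng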
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_N)
plt.figure(1, figsize=(14, 13))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_ratio_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
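# A follow-up sketch (the 95% target is an assumption): pick the smallest
# number of components that explains ~95% of the variance, then project the
# normalized data onto them.
cum = np.cumsum(pca.explained_variance_ratio_)
n_comp = int(np.argmax(cum >= 0.95) + 1)
print('components for 95% of the variance:', n_comp)
pca_k = PCA(n_components=n_comp).fit(x_train_N)
x_train_pca = pca_k.transform(x_train_N)
x_test_pca = pca_k.transform(x_test_N)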
# Standardize the data, then split the features into mean, se and worst groups
data_std = (data - data.mean()) / data.std()
data_mean = data_std.iloc[:,0:10]
data_se = data_std.iloc[:,10:20]
data_worst = data_std.iloc[:,20:30]
data_mean.shape
# Standardized distributions of the mean features
data_mean_1 = pd.concat([data_mean,dia],axis=1)
data_mean_2 = pd.melt(data_mean_1,id_vars='diagnosis',var_name='features',value_name='value')
plt.figure(figsize=(20,10))
sn.violinplot(x='features', y='value', hue='diagnosis', data=data_mean_2,split=True,inner='quart')
# Distributions of the standard-error (se) features
data_se_1 = pd.concat([data_se,dia],axis=1)
data_se_2 = pd.melt(data_se_1,id_vars='diagnosis',var_name='features',value_name='value')
plt.figure(figsize=(20,10))
sn.violinplot(x='features', y='value', hue='diagnosis', data=data_se_2,split=True,inner='quart')
# Distributions of the worst-value features
data_worst_1 = pd.concat([data_worst,dia],axis=1)
data_worst_2 = pd.melt(data_worst_1,id_vars='diagnosis',var_name='features',value_name='value')
plt.figure(figsize=(20,10))
sn.violinplot(x='features', y='value', hue='diagnosis', data=data_worst_2,split=True,inner='quart')
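# A refactoring sketch (the helper name is hypothetical, not in the original):
# the concat / melt / violinplot steps above are repeated for each feature
# group, so they can be factored into a single helper.
def violin_by_diagnosis(frame, label=dia):
    melted = pd.melt(pd.concat([frame, label], axis=1),
                     id_vars='diagnosis', var_name='features', value_name='value')
    plt.figure(figsize=(20, 10))
    sn.violinplot(x='features', y='value', hue='diagnosis',
                  data=melted, split=True, inner='quart')

violin_by_diagnosis(data_mean)
violin_by_diagnosis(data_se)
violin_by_diagnosis(data_worst)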